#!/usr/bin/env python
# coding: utf-8

# In[1]:

exec(open('init_path.py').read())
exec((P_Lib/'GasStation.py').read_text())
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:


YMD = read_hdf(P_Research / 'Others' / 'Datetime' / 'utils_dates_YMD.h5', 'utils')
dict_intYMD_to_y2kW = YMD.set_index('intYMD').y2kW.to_dict()

# In[3]:

#### WEATHER BREAKS

files = sorted(list((P_GS_Data_Raw / 'PH_Day').glob('*.h5')))
print(len(files))
ls_strYMD_weekday_nonholiday = load_obj(P_GS_Data / 'GS' / 'ls_strYMD_weekday_nonholiday.pkl')
files = [f for f in files if f.stem in ls_strYMD_weekday_nonholiday]
print(len(files))
# Dates in weather range ONLY
files = [f for f in files if (int(f.stem)>=20160101) & (int(f.stem)<=20181231)]
print(len(files))

fuels = ['e5','e10','diesel']

# ### Merge Weather Shock with Price

# In[343]:


weather_type = 'temp'
shock_type = 'neg'
shock_col = {'neg':'ShockNeg','pos':'ShockPos','any':'ShockAny'}[shock_type]
#fuel = 'e5'
res_thld = 10


# In[344]:

for fuel in fuels:

    # Build YM Map: 1020=>[1011,1012,...1020,1021,..,1025]
    df = DataFrame(date_range('20200101 00:00', '20200101 23:59', freq='min'), columns=['dt'])
    df['intHM'] = df.dt.dt.hour*100 + df.dt.dt.minute
    ls_inthm = np.repeat(df.loc[(df.intHM>=h_min*100) & (df.intHM<=h_max*100), 'intHM'].tolist(), 2*res_thld+1)
    ls_inthm_nextXmin = []
    for i, dt, inthm in df[(df.intHM>=h_min*100) & (df.intHM<=h_max*100)].itertuples():
        ls_inthm_nextXmin.extend(df.loc[i-res_thld:i+res_thld, 'intHM'].tolist())
    intHM = DataFrame({'intHM':ls_inthm, 'intHMNextXMin':ls_inthm_nextXmin})
    intHM.head(2)


    # In[345]:


    dict_gsid_to_wsid = load_obj(P_GS_Data / 'Shock_Weather' / ('dict_gsid_to_wsid_'+weather_type+'.pkl'))


    # In[346]:


    shock = read_parquet(P_GS_Data_Raw / 'Weather' / (weather_type+'_shocks.parquet')).rename(columns={'StID':'WSID'})
    shock = shock[shock[shock_col]==1].copy()
    shock = shock[shock.WSID.isin(list(dict_gsid_to_wsid.values()))].copy()
    shock.head(2)


    # In[347]:


    date = read_parquet(P_GS_Data_Raw / 'Weather' / 'date.parquet')
    date['intYMD'] = date.index.values//10000
    date['intHM'] = date.index.values%10000
    date.head(2)


    # In[348]:


    shock['intYMD'] = shock.YMDHM.map(date.intYMD.to_dict())
    shock['intHM'] = shock.YMDHM.map(date.intHM.to_dict())
    shock.head(2)


    # In[349]:


    gb = shock.groupby(['intYMD','WSID'])[shock_col].sum()
    dict_ymd_to_dict_wsid_to_nshock = {ymd:gb.get(ymd).to_dict() for ymd in shock.intYMD.unique()}


    # In[350]:


    def merge_and_save(s):
        ymd = s.intYMD.values[0]
        shock_ymd = merge(s[['WSID','intHM']], intHM).rename(columns={'intHM':'intHMShock','intHMNextXMin':'intHM'})
        shock_ymd.to_parquet(P_GS_Data / 'Shock_Weather' /'temp'/(str(ymd)+'.parquet'), compression='brotli', index=False)
        return None

    if shock_type == 'any':
        # for large data
        P_Tmp = Path(P_GS_Data / 'Shock_Weather' /'temp')
        [f.unlink() for f in P_Tmp.glob('*')]
        shock.groupby('intYMD')[['WSID','intYMD','intHM']].apply(merge_and_save)
    else:
        shock = merge(shock[['WSID','intYMD','intHM']], intHM).rename(columns={'intHM':'intHMShock','intHMNextXMin':'intHM'})
        gb_intYMD_to_shock = shock.groupby('intYMD') # given ymd, 


    # In[351]:


    ls = []
    for f in files:
        # f = files[100]
        ymd = int(f.stem)
        if ymd%100==1:
            print(ymd, end=',')
        df = read_hdf(f, 'GS')
        df = df[df[fuel]>0]
        df = df[df[fuel]!=df.groupby('StID')[fuel].shift(1)]
        df['H'] = df.Time.dt.hour
        df = df[(df.H>=h_min) & (df.H<=h_max-1)]
        df['intHM'] = df.Time.dt.hour*100+df.Time.dt.minute
        df['WSID'] = df.StID.map(dict_gsid_to_wsid)
        # Number of PC
        nPC = df.groupby('StID')[fuel].count(); nPC.name = 'nPC'
        # Number of Weather Shock
        if ymd not in dict_ymd_to_dict_wsid_to_nshock:
            nShock = Series(0, index=nPC.index, name='nShock')
            nRes = Series(0, index=nPC.index, name='nRes')
        else:
            nShock = df.groupby('StID').WSID.first().map(dict_ymd_to_dict_wsid_to_nshock[ymd]).fillna(0).astype(int); nShock.name = 'nShock'
            # Number of Responses
            if shock_type == 'any':
                shock_ymd = read_parquet(P_Tmp/(str(ymd)+'.parquet'))
                df = merge(df[['StID','intHM','WSID']], shock_ymd)
            else:
                df = merge(df[['StID','intHM','WSID']], gb_intYMD_to_shock.get_group(ymd)[['WSID','intHMShock','intHM']])
            nRes = df.groupby('StID').intHMShock.nunique(); nRes.name = 'nRes'
        # Merge & Clean
        df = concat([nPC,nShock,nRes], axis=1).fillna(0).astype(int).reset_index()
        # df = df[df.nShock>0].copy()
        df['YMD'] = ymd
        ls.append(df)


    # In[352]:


    df = concat(ls)
    del ls
    df.head(2)


    # In[353]:


    df['y2kW'] = df.YMD.map(dict_intYMD_to_y2kW)
    df = df.groupby(['StID','y2kW'])[['nPC','nShock','nRes']].mean().reset_index()
    df.head(2)


    # In[354]:


    print(df.StID.nunique())
    # remove those w/o any variations
    s = df.groupby('StID').nRes.sum()
    df = df[df.StID.isin(s[s>0].index.tolist())]
    # for each station, cut off the first 0s & last 0s
    def remove_head_tail_zeros(s):
        first_nonzero = s[s.nShock>0].index[0]
        last_nonzero = s[s.nShock>0].index[-1]
        return s.loc[first_nonzero:last_nonzero]
    df = df.groupby('StID').apply(remove_head_tail_zeros).reset_index(drop=True)
    print(df.StID.nunique())
    # Remove those with few observations, i.e. <143 weeks
    nW = df.groupby('StID').y2kW.nunique()
    df = df[df.StID.isin(nW[nW>=143].index.tolist())]
    print(df.StID.nunique())


    # In[355]:


    # Remove those with week gaps
    gb = df.groupby('StID').y2kW
    s = gb.nunique()-1 == (gb.max() - gb.min())
    df = df[df.StID.isin(s[s].index.tolist())]
    print(df.StID.nunique())


    # keep only data between Jan 2016 and December 2018
    df.drop(df[(df.y2kW > 991)].index, inplace=True)
    df.drop(df[(df.y2kW < 836)].index, inplace=True)
    # In[356]:


    version = '-'.join([weather_type, shock_type, ('wi'+str(res_thld)+'m'), fuel])
    print(version)
    f = P_GS_Data / 'break_weather' / (version+'.parquet')
    df.to_parquet(f, compression='brotli', index=False)

    fname = '_'.join([weather_type, shock_type, ('wi'+str(res_thld)+'m'), fuel]) + '.dta'
    print(fname)
    f = P_GS_Data / 'break_weather' /  fname
    df.to_stata(f, write_index=False)



# In[ ]:





# ### Crude Oil BREAKS

# In[4]:


oil_price_freq = '5min'
shock_thld = 'P90'
res_thld = 5


# In[ ]:

# In[26]:



version = 'v4'


# In[27]:
for fuel in fuels:
    

    # Build YM Map: 1020=>[1011,1012,...1020,1021,..,1025]
    df = DataFrame(date_range('20200101 00:00', '20200101 23:59', freq='min'), columns=['dt'])
    df['intHM'] = df.dt.dt.hour*100 + df.dt.dt.minute
    ls_inthm = np.repeat(df.loc[(df.intHM>=h_min*100) & (df.intHM<=h_max*100), 'intHM'].tolist(), 2*res_thld+1)
    ls_inthm_nextXmin = []
    for i, dt, inthm in df[(df.intHM>=h_min*100) & (df.intHM<=h_max*100)].itertuples():
        ls_inthm_nextXmin.extend(df.loc[i-res_thld:i+res_thld, 'intHM'].tolist())
    intHM = DataFrame({'intHM':ls_inthm, 'intHMNextXMin':ls_inthm_nextXmin})
    intHM = intHM[intHM.intHM%100%int(oil_price_freq.replace('min',''))==0]
    intHM.head(2)


    # In[28]:


    print(intHM.loc[intHM.intHM==900, 'intHMNextXMin'].tolist())


    # In[29]:


    f = P_GS_Data / 'Shock_Oil' / ('cruide_oil_' + oil_price_freq + '_drop_' + shock_thld + '.parquet')
    print(f.stem)
    oil = read_parquet(f)
    oil = oil[(oil.intHM>=h_min*100) & (oil.intHM<=h_max*100)].copy()
    oil.head(2)


    # In[30]:


    nShock = oil.groupby('intYMD').intHM.count().reset_index()
    nShock.columns = ['intYMD','nShock']
    nShock.head(2)


    # In[31]:


    oil = merge(oil, intHM)


    # In[32]:


    oil.columns = ['intYMD','Shock','intHM']
    oil['TGap'] = oil.intHM - oil.Shock
    oil.head(2)


    # In[33]:



    # v4: price drop only with nPC as control


    # In[35]:

    # # Price drop only
    ls_nPC = []
    ls_shock = []
    # # for f in files[:10]:
    for f in files:
    # # f = files[186]
        ymd = int(f.stem)
        if ymd%100==1:
            print(ymd, end=',')
        df = read_hdf(f, 'GS')
        df = df[df[fuel]>0]
        # df = df[df[fuel]!=df.groupby('StID')[fuel].shift(1)] # ANY price change
        df = df[df[fuel] - df.groupby('StID')[fuel].shift(1) < 0] # v2: price drop ONLY
        df['H'] = df.Time.dt.hour
        df = df[(df.H>=h_min) & (df.H<=h_max)]
        df['intHM'] = df.Time.dt.hour*100+df.Time.dt.minute
        # Number of PC
        nPC = df.groupby('StID')[fuel].count(); nPC.name = ymd
        df = merge(df, oil.loc[oil.intYMD==ymd, ['Shock','intHM']]).sort_values(['StID','Time'])
        s = df.groupby('StID').Shock.nunique()
        s.name = ymd
        ls_nPC.append(nPC)
        ls_shock.append(s)

    # In[36]:


    nPC = concat(ls_nPC, axis=1).fillna(0).astype(int).stack().reset_index()
    nPC.columns = ['StID','intYMD','nPC']
    nPC.head(2)


    # In[37]:


    # nPC[nPC.StID==16488]


    # In[38]:


    nRes = concat(ls_shock, axis=1).fillna(0).astype(int).stack().reset_index()
    nRes.columns = ['StID','intYMD','nRes']
    nRes.head(2)


    # In[39]:


    df = merge(nPC, nRes, how='outer').fillna(0).astype(int)
    df = merge(df, nShock, how='left').fillna(0).astype(int)
    df = df[df.nShock>0].copy() # only station-days with at least one shock
    df.head(2)


    # In[40]:


    # df[df.StID==16488]


    # In[41]:


    # make it weekly-avg
    df['pctRes'] = df.nRes / df.nShock
    df.head(2)


    # In[42]:


    YMD = read_hdf(P_Research / 'Others' / 'DateTime' / 'utils_dates_YMD.h5', 'utils')
    dict_intYMD_to_y2kW = YMD.set_index('intYMD').y2kW.to_dict()
    df['y2kW'] = df.intYMD.map(dict_intYMD_to_y2kW)
    df.head(2)


    # In[43]:


    df = df.groupby(['StID','y2kW'])[['nRes','nShock','pctRes','nPC']].mean().reset_index()
    df.head(2)


    # In[44]:

    # keep only data between Jan 2016 and December 2018
    df.drop(df[(df.y2kW > 991)].index, inplace=True)
    df.drop(df[(df.y2kW < 836)].index, inplace=True)

    # remove those w/o any variations
    s = df.groupby('StID').nRes.sum()
    df = df[df.StID.isin(s[s>0].index.tolist())]

    # remove those with week gaps
    ls_stid_with_gaps = df.loc[df.nPC==0, 'StID'].unique().tolist()
    df = df[~df.StID.isin(ls_stid_with_gaps)].copy()

    # In[46]:


    # f = P_GS_Data / 'Shock_Oil' / 'response_oil_shock_5min_90pctl.dta'
    fname = '_'.join([fuel,'shock', oil_price_freq, shock_thld, 'response', str(res_thld)+'min', version]) + '.dta'
    print(fname)
    f = P_GS_Data / 'break_oil' /  fname
    # df[df.StID<100].to_stata(f, write_index=False)
    df.to_stata(f, write_index=False)


# In[ ]:



# In[ ]:

# ### PCCount

#fuel = 'e5'
h_min = 7; h_max = 21
fuels = ['e5','e10','diesel']

for fuel in fuels:

    fname ='daily_weekday_nonholiday-7to21-positive_price_only_' + fuel  

    df =  read_hdf(P_GS_Data / 'PCCount' / (fname+'.h5'))

    YMD = read_hdf(P_Research / 'Others' / 'DateTime' / 'utils_dates_YMD.h5', 'utils')
    dict_intYMD_to_y2kW = YMD.set_index('intYMD').y2kW.to_dict()
    df['y2kW'] = df.YMD.map(dict_intYMD_to_y2kW)
    df.head(2)


    df = df.groupby(['StID','y2kW'])[['PCCount']].mean().reset_index()
    df.head(2)

    # keep only data between Jan 2016 and December 2018
    df.drop(df[(df.y2kW > 991)].index, inplace=True)
    df.drop(df[(df.y2kW < 836)].index, inplace=True)

    # remove those w/o any variations
    s = df.groupby('StID').PCCount.sum()
    df = df[df.StID.isin(s[s>0].index.tolist())]

    # remove those with week gaps
    ls_stid_with_gaps = df.loc[df.PCCount==0, 'StID'].unique().tolist()
    df = df[~df.StID.isin(ls_stid_with_gaps)].copy()

    # save df for merging later
    df_pccount = df

    fname = 'weekly_nPCCount_'+fuel+'.dta'
    print(fname)
    f = P_GS_Data / 'break_pccount' /  fname
    df.to_stata(f, write_index=False)




# In[ ]:


# ### PCResponse


fuels = ['e5','e10','diesel']
# fuel = 'e10'
#fuel = 'e5'
#fuel = 'diesel'
h_min = 7; h_max = 21
# h_min = 0; h_max = 6

for fuel in fuels:
    
    df = read_hdf(P_GS_Data / 'PCResponse' / ('rival_diffbrand_daily_weekday_nonholiday_'+fuel+'.h5'))


    YMD = read_hdf(P_Research / 'Others' / 'DateTime' / 'utils_dates_YMD.h5', 'utils')
    dict_intYMD_to_y2kW = YMD.set_index('intYMD').y2kW.to_dict()
    df['y2kW'] = df.YMD.map(dict_intYMD_to_y2kW)
    df.head(2)


    df = df.groupby(['StID','y2kW'])[['Response']].mean().reset_index()
    df.head(2)

    # keep only data between Jan 2016 and December 2018
    df.drop(df[(df.y2kW > 991)].index, inplace=True)
    df.drop(df[(df.y2kW < 836)].index, inplace=True)

    # remove those w/o any variations
    s = df.groupby('StID').Response.sum()
    df = df[df.StID.isin(s[s>0].index.tolist())]

    # remove those with week gaps
    ls_stid_with_gaps = df.loc[df.Response==0, 'StID'].unique().tolist()
    df = df[~df.StID.isin(ls_stid_with_gaps)].copy()

    # merge PCCount data
    df = pandas.merge(df, df_pccount, on=["StID", "y2kW"])

    fname = 'weekly_nPCResponse_'+fuel+'.dta'
    print(fname)
    f = P_GS_Data / 'break_pcresponse' /  fname
    df.to_stata(f, write_index=False)


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




